library(tidyverse)
library(caret)
library(lattice)
library(DataExplorer)
# Load the fake job postings dataset (CSV hosted on GitHub) into a tibble.
job_post <- read_csv(
  file = "https://raw.githubusercontent.com/luizmalpele/datasets/master/fake_job_postings.csv"
)
Warning: closing unused connection 3 (https://raw.githubusercontent.com/luizmalpele/StatsLearningProject/master/data/fake_job_postings.csv)
Parsed with column specification:
cols(
job_id = col_double(),
title = col_character(),
location = col_character(),
department = col_character(),
salary_range = col_character(),
company_profile = col_character(),
description = col_character(),
requirements = col_character(),
benefits = col_character(),
telecommuting = col_double(),
has_company_logo = col_double(),
has_questions = col_double(),
employment_type = col_character(),
required_experience = col_character(),
required_education = col_character(),
industry = col_character(),
`function` = col_character(),
fraudulent = col_double()
)
# Show the loaded data for a quick visual check.
job_post

# Plot the missing-value profile of every column (DataExplorer).
plot_missing(data = job_post)
# Fraction of missing (NA) values per column, computed separately for each
# value of `fraudulent`. Each ratio is the NA count divided by the group size.
# NOTE(review): `required_experience` is selected below but no ratio is
# computed for it -- confirm whether that omission is intentional.
job_post %>%
  select(
    fraudulent, department, required_education, benefits,
    required_experience, salary_range, location, requirements,
    company_profile, employment_type, industry
  ) %>%
  group_by(fraudulent) %>%
  summarize(
    na_ratio_salary = sum(is.na(salary_range)) / n(),
    na_ratio_department = sum(is.na(department)) / n(),
    na_ratio_required_education = sum(is.na(required_education)) / n(),
    na_ratio_benefits = sum(is.na(benefits)) / n(),
    na_ratio_requirements = sum(is.na(requirements)) / n(),
    na_ratio_company_profile = sum(is.na(company_profile)) / n(),
    na_ratio_location = sum(is.na(location)) / n(),
    na_ratio_employment_type = sum(is.na(employment_type)) / n(),
    na_ratio_industry = sum(is.na(industry)) / n()
  )
The variables that presented a higher missing-information ratio are company_profile and employment_type.
# For each value of `fraudulent`, compute the sum of each numeric flag column
# divided by the number of rows in that group.
job_post %>%
  group_by(fraudulent) %>%
  summarize(
    ratio_has_questions = sum(has_questions) / n(),
    ratio_has_company_logo = sum(has_company_logo) / n(),
    ratio_telecommuting = sum(telecommuting) / n()
  )
The ratios that stood out between the two fraudulent groups are ratio_has_questions and ratio_has_company_logo. The next step is to investigate the titles and descriptions using data mining and text mining techniques.